{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1c69778",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json, os, random, io, pdb\n",
    "from tqdm import tqdm\n",
    "from random import random\n",
    "import numpy as np\n",
    "import glob \n",
    "import pickle as pkl \n",
    "import re "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4337ec19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder holding the APPS test split; every entry directly beneath it is collected.\n",
    "test_loc = 'data/APPS/test/'\n",
    "\n",
    "# glob returns paths in arbitrary order, so sort them to pin a stable ordering.\n",
    "problems = sorted(glob.glob(test_loc + '*'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fba76cab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Substrings (matched against the lower-cased line) that open an examples section.\n",
    "# 'example 3:' is listed without a leading dash so it behaves like its siblings.\n",
    "_EXAMPLE_MARKERS = (\n",
    "    '-examples-', '-example-', '-example -',\n",
    "    '-example 1-', '-example 2-', '-example 3-', '-example 4-', '-example 5-',\n",
    "    'example:',\n",
    "    'example 1:', 'example 2:', 'example 3:', 'example 4:', 'example 5:',\n",
    ")\n",
    "\n",
    "# Case-sensitive headers that open an input / output block even outside an examples section.\n",
    "_INPUT_HEADERS = ('-Sample Input', '-Example Input', 'Sample Input:', '-Sample input')\n",
    "_OUTPUT_HEADERS = ('-Sample Output', '-Example Output', 'Sample Output:', '-Sample output')\n",
    "\n",
    "\n",
    "def _inline_payload(line, label):\n",
    "    '''Return the data that shares a header line with its label, or an empty string.\n",
    "\n",
    "    Args:\n",
    "        line: the header line exactly as read from the statement.\n",
    "        label: either input or output (lower case).\n",
    "\n",
    "    Returns:\n",
    "        The line with every example / sample / label-plus-colon occurrence removed\n",
    "        (matched case-insensitively), or an empty string when the line is a dashed\n",
    "        sample/example header, has no label followed by a colon, or holds only\n",
    "        whitespace once the labels are removed.\n",
    "    '''\n",
    "    lowered = line.lower()\n",
    "    if '-sample ' + label in lowered or '-example ' + label in lowered:\n",
    "        return ''\n",
    "    if label + ':' not in lowered:\n",
    "        return ''\n",
    "    # Cut the labels out of the ORIGINAL line, not the lower-cased copy, so the\n",
    "    # letter case of the test data survives (e.g. YES must not become yes).\n",
    "    payload = line\n",
    "    for word in ('example', 'sample', label + ':'):\n",
    "        payload = re.sub(re.escape(word), '', payload, flags=re.IGNORECASE)\n",
    "    return payload if payload.strip() else ''\n",
    "\n",
    "\n",
    "def find_in_out(lines):\n",
    "    '''Extract the example test cases written inside a problem statement.\n",
    "\n",
    "    The lines are scanned once with a small state machine:\n",
    "      * a line containing one of _EXAMPLE_MARKERS opens an examples section;\n",
    "      * a line containing -note- closes it and any block being collected;\n",
    "      * a blank line ends the block being collected;\n",
    "      * inside an examples section, a line containing Input / Output is a header;\n",
    "        the headers in _INPUT_HEADERS / _OUTPUT_HEADERS are recognised anywhere;\n",
    "      * every other line is appended to whichever block is currently open.\n",
    "\n",
    "    Args:\n",
    "        lines: iterable of statement lines, e.g. the result of file.readlines().\n",
    "\n",
    "    Returns:\n",
    "        A dict {'inputs': [...], 'outputs': [...]} holding one string per example,\n",
    "        or None when no input was collected, the two lists differ in length, or\n",
    "        the statement ended while an input block (or a still-empty output block)\n",
    "        was open.\n",
    "    '''\n",
    "    in_examples = False\n",
    "    reading_input = False\n",
    "    reading_output = False\n",
    "    inputs = []\n",
    "    outputs = []\n",
    "    curr_input = ''\n",
    "    curr_output = ''\n",
    "\n",
    "    for line in lines:\n",
    "        # A blank line terminates the block being collected.\n",
    "        if not line.strip():\n",
    "            reading_input = False\n",
    "            reading_output = False\n",
    "            continue\n",
    "\n",
    "        lowered = line.lower()\n",
    "\n",
    "        if any(marker in lowered for marker in _EXAMPLE_MARKERS):\n",
    "            in_examples = True\n",
    "            continue\n",
    "\n",
    "        if '-note-' in lowered:\n",
    "            in_examples = False\n",
    "            reading_input = False\n",
    "            reading_output = False\n",
    "            continue\n",
    "\n",
    "        if (in_examples and 'Input' in line) or any(h in line for h in _INPUT_HEADERS):\n",
    "            reading_input = True\n",
    "            reading_output = False\n",
    "            # A new input header completes the previous example's output.\n",
    "            if curr_output:\n",
    "                outputs.append(curr_output)\n",
    "                curr_output = ''\n",
    "            payload = _inline_payload(line, 'input')\n",
    "            if payload:\n",
    "                curr_input = payload\n",
    "            continue\n",
    "\n",
    "        if (in_examples and 'Output' in line) or any(h in line for h in _OUTPUT_HEADERS):\n",
    "            reading_output = True\n",
    "            reading_input = False\n",
    "            # An output header completes the input collected so far.\n",
    "            if curr_input:\n",
    "                inputs.append(curr_input)\n",
    "                curr_input = ''\n",
    "            payload = _inline_payload(line, 'output')\n",
    "            if payload:\n",
    "                curr_output = payload\n",
    "            continue\n",
    "\n",
    "        if reading_input:\n",
    "            curr_input += line\n",
    "        if reading_output:\n",
    "            curr_output += line\n",
    "\n",
    "    # Flush the last output; a non-empty one counts as properly finished.\n",
    "    if curr_output:\n",
    "        outputs.append(curr_output)\n",
    "        reading_output = False\n",
    "\n",
    "    if not inputs or len(inputs) != len(outputs) or reading_input or reading_output:\n",
    "        return None\n",
    "\n",
    "    return {'inputs': inputs, 'outputs': outputs}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19fd8cc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Offset added to each enumerate() position to form the dictionary keys below.\n",
    "# It was not defined anywhere in this notebook, so a fresh Restart & Run All raised\n",
    "# NameError; with 0 the keys are simply the positions in `problems`.\n",
    "start_idx = 0\n",
    "\n",
    "non_eng = {}  # key -> statement text, for statements containing the Russian marker below\n",
    "no_test = {}  # key -> statement text, for other statements where find_in_out returned None\n",
    "in_outs = {}  # key -> find_in_out() result (dict, or None when extraction failed)\n",
    "\n",
    "for problem_idx, problem in tqdm(enumerate(problems), total=len(problems)):\n",
    "    problem_idx += start_idx\n",
    "    prompt_path = os.path.join(problem, \"question.txt\")\n",
    "    # Decode explicitly: the Cyrillic check below must not depend on the platform default encoding.\n",
    "    with open(prompt_path, 'r', encoding='utf-8') as file:\n",
    "        lines = file.readlines()\n",
    "        temp = ''.join(lines)\n",
    "\n",
    "    in_out = find_in_out(lines)\n",
    "    in_outs[problem_idx] = in_out\n",
    "\n",
    "    # special case with non-English problems\n",
    "    if 'Входные' in temp:\n",
    "        non_eng[problem_idx] = temp\n",
    "    elif in_out is None:\n",
    "        no_test[problem_idx] = temp\n",
    "\n",
    "    # The result (possibly null) is written next to the statement for every problem.\n",
    "    example_test = os.path.join(problem, \"example_input_output.json\")\n",
    "    # Context manager so each handle is flushed and closed before the next problem.\n",
    "    with open(example_test, 'w') as out_file:\n",
    "        json.dump(in_out, out_file)\n",
    "\n",
    "print(\"Non-English task: {}\".format(len(non_eng)))\n",
    "print(\"Zero-example-test task: {}\".format(len(no_test)))\n",
    "\n",
    "'''\n",
    "Non-English task: 17\n",
    "Zero-example-test task: 46\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7eda3be",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One count per problem: number of extracted example inputs, 0 when extraction returned None.\n",
    "nb_example_tests = [\n",
    "    0 if extracted is None else len(extracted['inputs'])\n",
    "    for extracted in in_outs.values()\n",
    "]\n",
    "example_counts = np.array(nb_example_tests)\n",
    "\n",
    "print(\"Total number of samples: {}\".format(example_counts.shape))\n",
    "print(\"Average extracted example test cases: {}\".format(example_counts.mean()))\n",
    "\n",
    "'''\n",
    "Total number of samples: (5000,)\n",
    "Average extracted example test cases: 1.9764\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81b26635",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
